# Convert one annotated raw-text file into character-level BIO data.
#
# Reads `raw_file` together with `ann_file` (tab-separated lines of the form
# "<id>\t<type> <start> <end>\t<surface text>"), tags every character with
# O / B-<TYPE> / I-<TYPE>, cuts the text into one sample per line, shuffles the
# samples and appends them 80/10/10 to data.train / data.dev / data.test.

# Maps the entity type names used in the annotation file to output tag names.
ENT_TYPE = {
    "企业": "COMP",
    "时间": "TIME",
    "产品": "PROD",
    "地点": "LOC",
    "人物": "PER",
}

raw_file = "raw04.txt"
ann_file = "raw04.ann"


def parse_annotations(lines):
    """Parse annotation lines into ``[tag_name, start, end, surface]`` items.

    Args:
        lines: iterable of strings shaped like
            ``"<id>\\t<type> <start> <end>\\t<surface>"``.

    Returns:
        A list of ``[ENT_TYPE[type], int(start), int(end), surface]`` lists,
        in input order.

    Raises:
        ValueError: if a non-blank line does not have the expected fields.
        KeyError: if the entity type is not a key of ``ENT_TYPE``.
    """
    items = []
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing empty line) carry no annotation.
            continue
        _, middle, string = line.split("\t")
        entity_type, start, end = middle.split()
        items.append([ENT_TYPE[entity_type], int(start), int(end), string])
    return items


def build_tags(length, annotations):
    """Return a list of ``length`` BIO tags derived from ``annotations``.

    Every position in ``[start, end)`` gets ``I-<tag>`` and ``start`` itself
    gets ``B-<tag>``. Annotations are applied in order, so a later span
    overwrites an earlier overlapping one.
    """
    tags = ['O'] * length
    for tag_name, start, end, _ in annotations:
        for pos in range(start, end):
            tags[pos] = "I-" + tag_name
        tags[start] = "B-" + tag_name
    return tags


def split_sentences(text, tags):
    """Cut ``text`` into per-line samples of ``[chars, labels]``.

    A sample ends at each ``'\\n'``. ``'\\r'`` and ``'\\n'`` are not emitted
    and a space is written as ``"-"``. Empty lines produce no sample, and a
    final line without a trailing newline is still returned.
    """
    data = []
    chars, labels = [], []
    for char, tag in zip(text, tags):
        if char == '\n':
            if chars:
                data.append([chars, labels])
            chars, labels = [], []
        elif char != '\r':
            chars.append("-" if char == " " else char)
            labels.append(tag)
    if chars:
        # The text did not end with a newline: keep the last line too.
        data.append([chars, labels])
    return data


def split_dataset(data):
    """Split ``data`` into (train, dev, test) with an 80/10/10 ratio.

    Train and dev sizes are rounded down; the test part takes the remainder.
    """
    total = len(data)
    n_train, n_dev = int(total * 0.8), int(total * 0.1)
    boundary = n_train + n_dev
    # Slice from the boundary instead of data[-n_test:], which would return
    # the whole list if n_test were 0.
    return data[:n_train], data[n_train:boundary], data[boundary:]


def write_samples(path, samples):
    """Append ``samples`` to ``path``, one "char label" pair per line.

    Each sample is followed by an empty line. The file is opened in append
    mode, so existing content is kept.
    """
    with open(path, 'a+', encoding='utf-8') as out:
        for chars, labels in samples:
            out.writelines(c + ' ' + t + '\n' for c, t in zip(chars, labels))
            out.write('\n')


def main():
    """Run the conversion for ``raw_file`` / ``ann_file``."""
    # Imported here so the helpers above stay importable without scikit-learn.
    from sklearn.utils import shuffle

    # newline="" disables newline translation, so '\r\n' stays two characters;
    # presumably the annotation offsets count characters of the untranslated
    # file -- TODO confirm against the annotation tool.
    with open(raw_file, 'r', encoding='utf-8', newline="") as raw:
        text = raw.read()
    with open(ann_file, 'r', encoding='utf-8') as ann:
        annotations = parse_annotations(ann)

    tags = build_tags(len(text), annotations)
    data = shuffle(split_sentences(text, tags))
    train_data, dev_data, test_data = split_dataset(data)

    write_samples("data.train", train_data)
    write_samples("data.dev", dev_data)
    write_samples("data.test", test_data)


if __name__ == "__main__":
    main()



